library(tidyverse)
── Attaching core tidyverse packages ─────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ lubridate 1.9.3 ✔ tibble 3.2.1
✔ purrr 1.0.2 ✔ tidyr 1.3.0── Conflicts ───────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks plotly::filter(), stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
# read data
data <- read.csv('data/2018_Data.csv')
print(data)
View(data)
# Read required columns file (column)
column <- read.csv('data/2018_MetaData.csv')
# select the required columns from the total data
main_df <- select(data, column$Column_Name)
print(main_df)
main_df <- main_df[-c(1),]
rownames(main_df) <- 1:nrow(main_df)
print(main_df)
main_df <- main_df %>%
rename_at(vars(GEO_ID:DP05_0089E), ~ column$Label_Shortcut) %>%
separate_wider_delim(NAME, ",", names = c('COUNTY', 'STATE'))
main_df$COUNTY <- gsub(" County", "", as.character(main_df$COUNTY))
print(main_df)
# Check Null values
sum(is.null(main_df))
[1] 0
sapply(main_df, class)
GEO_ID COUNTY
"character" "character"
STATE SA_TOTAL_POPULATION
"character" "character"
SA_TP_MALE SA_TP_FEMALE
"character" "character"
SA_TP_SEX_RATIO SA_TP_UNDER_5
"character" "character"
SA_TP_5_TO_9 SA_TP_10_TO_14
"character" "character"
SA_TP_15_TO_19 SA_TP_20_TO_24
"character" "character"
SA_TP_25_TO_34 SA_TP_35_TO_44
"character" "character"
SA_TP_45_TO_54 SA_TP_55_TO_59
"character" "character"
SA_TP_60_TO_64 SA_TP_65_TO_74
"character" "character"
SA_TP_75_TO_84 SA_TP_85_ABOVE
"character" "character"
SA_TP_MEDIAN_AGE SA_TP_UNDER_18
"character" "character"
SA_TP_16_ABOVE SA_TP_18_ABOVE
"character" "character"
SA_TP_21_ABOVE SA_TP_62_ABOVE
"character" "character"
SA_TP_65_ABOVE SA_TP_18_ABOVE_MALE
"character" "character"
SA_TP_18_ABOVE_FEMALE SA_TP_18_ABOVE_SEX_RATIO
"character" "character"
SA_TP_65_ABOVE_MALE SA_TP_65_ABOVE_FEMALE
"character" "character"
SA_TP_65_ABOVE_SEX_RATIO RACE_TOTAL_POP
"character" "character"
RACE_TP_1_RACE RACE_TP_2_OR_MORE_RACES
"character" "character"
RACE_TP_1_WHITE RACE_TP_1_BLK_AFR_AMER
"character" "character"
RACE_TP_1_AMER_IND_ALAS_NAT RACE_TP_1_AIAN_CHEROKEE
"character" "character"
RACE_TP_1_AIAN_CHIPPEWA RACE_TP_1_NAVAJO
"character" "character"
RACE_TP_1_SIOUX RACE_TP_1_ASIAN
"character" "character"
RACE_TP_1_ASIAN_INDIAN RACE_TP_1_ASIAN_CHINESE
"character" "character"
RACE_TP_1_ASIAN_JAPANESE RACE_TP_1_ASIAN_FILIPINO
"character" "character"
RACE_TP_1_ASIAN_KOREAN RACE_TP_1_ASIAN_VIETNAMESE
"character" "character"
RACE_TP_1_ASIAN_OTHER RACE_TP_1_NAT_HAW_OTH_PAC_ISD
"character" "character"
RACE_TP_1_NHOPI_NAT_HAWAIIAN RACE_TP_1_NHOPI_GUAMANIAN_CHAMORRO
"character" "character"
RACE_TP_1_NHOPI_SAMOAN RACE_TP_1_NHOPI_OTH_PAC_ISD
"character" "character"
RACE_TP_1_SOME_OTH_RACE RACE_TP_2_MORE_WHITE_BLK_AFR_AMERT
"character" "character"
RACE_TP_2_MORE_WHITE_AMER_IND_ALAS_NAT RACE_TP_2_MORE_WHITE_ASIAN
"character" "character"
RACE_TP_2_MORE_BLK_AFR_AMER_AMER_INDIAN_ALAS_NAT RACE_ALONE_1_2_MORE_TOTAL_POP
"character" "character"
RACE_ALONE_1_2_MORE_TP_WHITE RACE_ALONE_1_2_MORE_TP_BLK_AFR_AMER
"character" "character"
RACE_ALONE_1_2_MORE_TP_AMER_IND_ALAS_NAT RACE_ALONE_1_2_MORE_TP_ASIAN
"character" "character"
RACE_ALONE_1_2_MORE_TP_NAT_HAW_OTH_PAC_ISD RACE_ALONE_1_2_MORE_TP_SOME_OTH_RACE
"character" "character"
HISPANIC_LATINO_RACE_TOTAL_POP HL_RACE_HISPANIC_LATINO
"character" "character"
HL_RACE_MEXICAN HL_RACE_PUERTO_RICAN
"character" "character"
HL_RACE_CUBAN HL_RACE_OTHER_HISPANIC_LATINO
"character" "character"
HL_NOT_HISPANIC_LATINO_TOTAL_POP HL_NOT_HILA_WHITE_ALONE
"character" "character"
HL_NOT_HILA_BLK_AFR_AMER HL_NOT_HILA_AMER_IND_ALAS_NAT_ALONE
"character" "character"
HL_NOT_HILA_ASIAN_ALONE HL_NOT_HILA_NAT_HAW_OTH_PAC_ISD_ALONE
"character" "character"
HL_NOT_HILA_SOME_OTH_RACE_ALONE HL_NOT_HILA_2_MORE_RACE
"character" "character"
HL_NOT_HILA_2_MORE_INC_SOME_OTH_RACE HL_NOT_HILA_2_MORE_EXC_SOME_OTHE_RACE_3_ MORE
"character" "character"
TOTAL_HOUSING_UNITS CITIZEN_VOTE_AGE_POP_18_ABOVE
"character" "character"
CITIZEN_VOTE_AGE_POP_18_ABOVE_MALE CITIZEN_VOTE_AGE_POP_18_ABOVE_FEMALE
"character" "character"
# Function to convert from character to integer without error handling
convert_to_integer <- function(x) {
as.integer(ifelse(grepl("^-?\\d+$", x), x, NA))
}
# Function to convert from character to numeric without error handling
convert_to_numeric <- function(x) {
as.numeric(ifelse(grepl("^-?\\d+\\.?\\d*$", x), x, NA))
}
# Columns to convert to float
float_columns <- c("SA_TP_SEX_RATIO",
"SA_TP_MEDIAN_AGE",
"SA_TP_18_ABOVE_SEX_RATIO",
"SA_TP_65_ABOVE_SEX_RATIO")
character_columns <- c("GEO_ID", "COUNTY", "STATE")
# Columns to convert to integer
integer_columns <- setdiff(names(main_df), c(float_columns, character_columns))
main_df <- main_df %>%
mutate(across(all_of(float_columns), convert_to_numeric),
across(all_of(integer_columns), convert_to_integer))
print(main_df)
state <- unique(main_df$STATE)
df_state <- data.frame(state)
print(df_state)
df_fips <- read.csv("data/COUNTY-FIPS.csv")
df_fips
# add leading zeros
df_fips <- data.frame(distinct(df_fips))
df_fips$FIPS_CODE <- sprintf("%05d", df_fips$FIPS_CODE)
print(df_fips)
df_state_abbr <- read.csv("data/State-Abbr.csv")
df_state_abbr
df_fips_state <- merge(df_fips, df_state_abbr, by = c("STATE","STATE_ABBR"))
df_fips_state
df_fips_state <- df_fips_state[order(df_fips_state$STATE, df_fips_state$COUNTY), ]
rownames(df_fips_state) <- 1:nrow(df_fips_state)
df_fips_state$COUNTY <- gsub(" County", "", as.character(df_fips_state$COUNTY))
df_fips_state
# write.csv(df_fips_state, "data/County-Fips.csv", row.names = FALSE)
# Trim Preceding or succeeding spaces
main_df$STATE <- trimws(main_df$STATE)
df <- merge(main_df, df_fips_state, by = c("COUNTY", "STATE"))
df
print(sum(is.na(df)))
[1] 29
# Get null values in dataframe
df[!complete.cases(df), ]
df$SA_TP_65_ABOVE_SEX_RATIO[is.na(df$SA_TP_65_ABOVE_SEX_RATIO)] <- 0
print(sum(is.na(df)))
[1] 0
df <- df[order(df$STATE, df$COUNTY), ]
rownames(df) <- 1:nrow(df)
df
’
# write.csv(df, "data/Cleaned_df.csv", row.names = FALSE )
# Check the uncompatible lines
# missing_combinations <- anti_join(main_df, df_fips_state, by = c("COUNTY", "STATE"))
# missing_combinations